1. Load and Pre-processing data¶

In [1]:
pip install plotly
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: plotly in /home/yuexia/.local/lib/python3.8/site-packages (5.22.0)
Requirement already satisfied: tenacity>=6.2.0 in /home/yuexia/.local/lib/python3.8/site-packages (from plotly) (8.5.0)
Requirement already satisfied: packaging in /home/yuexia/.local/lib/python3.8/site-packages (from plotly) (23.2)

[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python3 -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
In [2]:
import numpy as np, pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from datetime import date
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')
In [3]:
# load the data and show the first five rows info
data = pd.read_csv("marketing_campaign.csv",delimiter='\t')
data.shape # (2240, 29)
data.head().style.background_gradient(cmap='Pastel2')
Out[3]:
  ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.000000 0 0 04-09-2012 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.000000 1 1 08-03-2014 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.000000 0 0 21-08-2013 26 426 49 127 111 21 42 1 8 2 10 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.000000 1 0 10-02-2014 26 11 4 20 10 3 5 2 2 0 4 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.000000 1 0 19-01-2014 94 173 43 118 46 27 15 5 5 3 6 5 0 0 0 0 0 0 3 11 0
In [4]:
# change data type with some of the column
data["Dt_Customer"] = pd.to_datetime(data['Dt_Customer'], dayfirst=True,format = '%d-%m-%Y')
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2240 non-null   int64         
 1   Year_Birth           2240 non-null   int64         
 2   Education            2240 non-null   object        
 3   Marital_Status       2240 non-null   object        
 4   Income               2216 non-null   float64       
 5   Kidhome              2240 non-null   int64         
 6   Teenhome             2240 non-null   int64         
 7   Dt_Customer          2240 non-null   datetime64[ns]
 8   Recency              2240 non-null   int64         
 9   MntWines             2240 non-null   int64         
 10  MntFruits            2240 non-null   int64         
 11  MntMeatProducts      2240 non-null   int64         
 12  MntFishProducts      2240 non-null   int64         
 13  MntSweetProducts     2240 non-null   int64         
 14  MntGoldProds         2240 non-null   int64         
 15  NumDealsPurchases    2240 non-null   int64         
 16  NumWebPurchases      2240 non-null   int64         
 17  NumCatalogPurchases  2240 non-null   int64         
 18  NumStorePurchases    2240 non-null   int64         
 19  NumWebVisitsMonth    2240 non-null   int64         
 20  AcceptedCmp3         2240 non-null   int64         
 21  AcceptedCmp4         2240 non-null   int64         
 22  AcceptedCmp5         2240 non-null   int64         
 23  AcceptedCmp1         2240 non-null   int64         
 24  AcceptedCmp2         2240 non-null   int64         
 25  Complain             2240 non-null   int64         
 26  Z_CostContact        2240 non-null   int64         
 27  Z_Revenue            2240 non-null   int64         
 28  Response             2240 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(25), object(2)
memory usage: 507.6+ KB
In [5]:
# deal with missing values - drop
# data.isnull().sum() # income: 24
data=data.dropna(subset=['Income'])
data.isnull().sum()
Out[5]:
ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64
In [6]:
print(data.Education.unique())
print(data.Marital_Status.unique())
data.columns
['Graduation' 'PhD' 'Master' 'Basic' '2n Cycle']
['Single' 'Together' 'Married' 'Divorced' 'Widow' 'Alone' 'Absurd' 'YOLO']
Out[6]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')
In [7]:
# dates of the newest and oldest recorded customer
print("The newest customer's enrolment time:",max(data["Dt_Customer"]))
print("The oldest customer's enrolment time:",min(data["Dt_Customer"]))
The newest customer's enrolment time: 2014-06-29 00:00:00
The oldest customer's enrolment time: 2012-07-30 00:00:00
In [8]:
# combining similarity information columns, then dropping irrelevant columns, or creating a new feature based on existing features
data.Education = data.Education.replace(['PhD','2n Cycle','Graduation', 'Master'],'Post Graduate')  
data.Education = data.Education.replace(['Basic'], 'Under Graduate')

data.Marital_Status = data.Marital_Status.replace(['Widow','Alone','Absurd','YOLO'],"Single")
data.Marital_Status = data.Marital_Status.replace(['Together','Married','Divorced'],"Relationship")

data["Kids"] = data.Kidhome + data.Teenhome
data["All_Cmp"] = data.AcceptedCmp1 + data.AcceptedCmp2 + data.AcceptedCmp3 + data.AcceptedCmp4 + data.AcceptedCmp5
data["Age"] = 2023 - data.Year_Birth #current year 2023.
data['Spending']=data.MntWines+data.MntFruits+data.MntMeatProducts+data.MntFishProducts+data.MntSweetProducts+data.MntGoldProds

# the last date of the entire csv
last_date=date(2014,6,29)
# the number of months that a customer has stayed
data['StayMonths'] = pd.to_numeric(data['Dt_Customer'].dt.date.apply(lambda x: (last_date - x)).dt.days, downcast='integer')/30

data = data.drop(["ID","Z_CostContact","Z_Revenue","Kidhome","Teenhome","AcceptedCmp1","AcceptedCmp2","AcceptedCmp3",
                 "AcceptedCmp4","AcceptedCmp5","Year_Birth","Dt_Customer"],axis=1)
data.head().style.background_gradient(cmap='Pastel2')
Out[8]:
  Education Marital_Status Income Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth Complain Response Kids All_Cmp Age Spending StayMonths
0 Post Graduate Single 58138.000000 58 635 88 546 172 88 88 3 8 10 4 7 0 1 0 0 66 1617 22.100000
1 Post Graduate Single 46344.000000 38 11 1 6 2 1 6 2 1 1 2 5 0 0 2 0 69 27 3.766667
2 Post Graduate Relationship 71613.000000 26 426 49 127 111 21 42 1 8 2 10 4 0 0 0 0 58 776 10.400000
3 Post Graduate Relationship 26646.000000 26 11 4 20 10 3 5 2 2 0 4 6 0 0 1 0 39 53 4.633333
4 Post Graduate Relationship 58293.000000 94 173 43 118 46 27 15 5 5 3 6 5 0 0 1 0 42 422 5.366667

2. Exploratory data and visualisation¶

In [9]:
# plot1 
fig, axes = plt.subplots(1, 3, figsize=(10, 4))

sns.countplot(x='Education', data=data, palette='Set2', ax=axes[0])
axes[0].set_title('Frequency of Education Situation')

sns.countplot(x='Marital_Status',data=data,ax=axes[1])
axes[1].set_title('Marital Status')

sns.countplot(x='Kids',data=data,palette='Set3',ax=axes[2])
plt.title("Frequency Of Each Category\n in the Kids Variable")

plt.tight_layout()
plt.show()
No description has been provided for this image

Findings:¶

  1. Education: I reset 'PhD', '2n Cycle',' Graduation', and 'Master' to "Post Graduate", and the "basic" is "Under Graduate". This dataset contains most of the postgraduate customers, over 2,000. Whereas just a few undergraduates, even though they are less than 250.
  2. Marital Status: I reset 'Widow', 'Alone', 'Absurd', and 'YOLO' as "Single" variables; the other is "Relationship". The people quantity is the difference around 1,000.
  3. Kids: I created a new column named "Kids", which concatenates "Kidhome" and "Teenhome". From the plot, we can see around 600 customers do not have kids; the highest number of people have only one kid, and only a few have more than three kids.
In [10]:
# plot2 average income in box plot
plt.figure(figsize=(6,4))
data.Income.plot.box(color ='turquoise')
plt.show()
No description has been provided for this image

Finding:¶

according to plot2, the box plot shows the income values contain outliers, which are above $600,000, so let's remove them

In [11]:
# deal with outliers
data = data[data["Income"] < 600000]
data.describe().style.background_gradient(cmap='Pastel2')
Out[11]:
  Income Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth Complain Response Kids All_Cmp Age Spending StayMonths
count 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000 2215.000000
mean 51969.861400 49.024379 305.225282 26.361625 167.063205 37.651016 27.040632 43.979684 2.322799 4.085779 2.671783 5.802257 5.318736 0.009481 0.150339 0.947178 0.298420 54.183296 607.321445 11.783461
std 21526.320095 28.949608 337.345380 39.802036 224.311559 54.760822 41.077594 51.822660 1.923820 2.741473 2.927179 3.250974 2.425863 0.096929 0.357484 0.749230 0.679332 11.987000 602.925291 6.749291
min 1730.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 27.000000 5.000000 0.000000
25% 35284.000000 24.000000 24.000000 2.000000 16.000000 3.000000 1.000000 9.000000 1.000000 2.000000 0.000000 3.000000 3.000000 0.000000 0.000000 0.000000 0.000000 46.000000 69.000000 6.000000
50% 51373.000000 49.000000 175.000000 8.000000 68.000000 12.000000 8.000000 25.000000 2.000000 4.000000 2.000000 5.000000 6.000000 0.000000 0.000000 1.000000 0.000000 53.000000 397.000000 11.833333
75% 68487.000000 74.000000 505.000000 33.000000 232.500000 50.000000 33.000000 56.000000 3.000000 6.000000 4.000000 8.000000 7.000000 0.000000 0.000000 1.000000 0.000000 64.000000 1048.000000 17.633333
max 162397.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 262.000000 321.000000 15.000000 27.000000 28.000000 13.000000 20.000000 1.000000 1.000000 3.000000 4.000000 130.000000 2525.000000 23.300000
In [12]:
# plot3 average income
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(data.Income,color ='green',ax=axes[0])
sns.distplot(data.Spending,color ='orange', ax=axes[1])
plt.tight_layout()
plt.show()
No description has been provided for this image

Finding:¶

  • after removing the outliers, the income variable appears bell curve. It is for later GMM method.
  • Spending variable comes from the customers spending money on wine, fruits, meat, fish, sweets, and gold. It looks like skewed data.
In [13]:
# correlation matrix 
# Selecting only numeric columns from the DataFrame
numeric_data = data.select_dtypes(include=['number'])

# Computing the correlation matrix
corrmat = numeric_data.corr()

# Plotting the heatmap
plt.figure(figsize=(18,18)) 
sns.heatmap(corrmat, annot=True, center=0, cmap="Set2")
plt.show()
No description has been provided for this image

3. Data Normalisation¶

Take 3 primany features ('Income','StayMonths','Spending') from the dataset for GMM clustering.

In [14]:
data_new = data[['Income','StayMonths','Spending']]
# standardise the numerical variables
column= data_new.columns
scaler = preprocessing.StandardScaler()
data_new_standard= scaler.fit_transform(data_new)

data_new_normalized = preprocessing.normalize(data_new_standard,norm='l2') 
data_new_normalized
Out[14]:
array([[ 0.12537976,  0.66883499,  0.73276168],
       [-0.16850324, -0.76582868, -0.6205748 ],
       [ 0.93476116, -0.2099754 ,  0.28658664],
       ...,
       [ 0.15988431, -0.6733312 ,  0.72184631],
       [ 0.60693959, -0.73771711,  0.29563121],
       [ 0.02765358,  0.87791651, -0.47801453]])

Identify the number of clusters using Silhouette Score¶

In [15]:
# silhouette method 
scores = []
for i in range(2, 15):
    GMM=GaussianMixture(n_components=i,random_state=42)
    clusters = GMM.fit_predict(data_new_normalized)
    scores.append(metrics.silhouette_score(data_new_normalized, clusters, metric='euclidean'))

plt.figure(figsize=(8,4))
plt.plot(range(2, 15), scores, 'bo-', color='purple')
plt.xlabel('clusters')
plt.ylabel('Silhouette Score')
plt.title('Identify the number of clusters using Silhouette Score')
plt.show()
No description has been provided for this image

Finding:¶

  • The n_components = 4 is highest silhouette score, which is 0.47. So, I will plot the 4 clusters in next step.
  • The n_components = 2 has silhouette score 0.46.
  • Others n components I do not consider.

4. Clustering Algorithm - GaussianMixture¶

In [16]:
# apply GaussianMixture clustering method with n_components=4
GMM=GaussianMixture(n_components=4,random_state=42).fit(data_new_normalized)
cluster_predicted = GMM.predict(data_new_normalized)

# plot 
plt.figure(figsize=(6,6))
plt.scatter(data_new_normalized[:, 0], data_new_normalized[:, 1], c=cluster_predicted, cmap='Set2');
No description has been provided for this image

Finding:¶

The plot shows that the data have been nicely grouped into four groups.

In [17]:
# added cluster_predicted value into data_new data frame
data_new["Cluster"] = cluster_predicted
data_new.head().style.background_gradient(cmap='Pastel2')
Out[17]:
  Income StayMonths Spending Cluster
0 58138.000000 22.100000 1617 1
1 46344.000000 3.766667 27 0
2 71613.000000 10.400000 776 3
3 26646.000000 4.633333 53 0
4 58293.000000 5.366667 422 3

4. Summary¶

In [18]:
summary = data_new.groupby('Cluster').agg({'mean'})

# Visualisasi 
plt.figure(figsize=(15, 4))
for index, variable in enumerate(data_new.columns):
    if index > 2:
        break
    plt.subplot(1,3,index+1)
    sns.barplot(x = summary.reset_index().Cluster, y = summary[variable]['mean'],palette='Set2')
    plt.ylabel(variable, fontsize=10)
    plt.xlabel('Cluster', fontsize=10)  
plt.show()
No description has been provided for this image

Insights¶

The dataset has following 4 clusters, which are 0,1,2 and 3.

  • Cluster 0: new customers, low income, small spending ("Attention")
  • Cluster 1: new customers, high income, high spending ("Target Customer")
  • Cluster 2: old customers, low income, small spending ("Not interested")
  • Cluster 3: old customers, high income, high spending ("Loyal High Spending")

Visualisation for Insights¶

In [19]:
# Visualisation
data_new=data_new.replace({0:'Attention',1:'Target Customer',2:'Not interested',3:'Loyal High Spending'})
PLOT = go.Figure()
colors = ['orange', 'pink', 'skyblue', 'lightgrey']
color_index = 0
for C in list(data_new.Cluster.unique()):
    PLOT.add_trace(go.Scatter3d(x = data_new[data_new.Cluster == C]['Income'],
                                y = data_new[data_new.Cluster == C]['StayMonths'],
                                z = data_new[data_new.Cluster == C]['Spending'],                        
                                mode = 'markers',marker_size = 6, marker_line_width = 1,
                                marker_color=colors[color_index],
                                name = str(C)))
    color_index += 1

PLOT.update_traces(hovertemplate='Income: %{x} <br>StayMonths: %{y} <br>Spending: %{z}')

PLOT.update_layout(width = 850, height = 850, autosize = True, showlegend = True,
                   scene = dict(xaxis=dict(title = 'Income', titlefont_color = 'black'),
                                yaxis=dict(title = 'StayMonths', titlefont_color = 'black'),
                                zaxis=dict(title = 'Spending', titlefont_color = 'black')),
                   font = dict(family = "Gilroy", color  = 'black', size = 12))
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: